## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1.9000 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: digest
## Loading required package: cluster
## Loading required package: factoextra
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
##
## Attaching package: 'moonBook'
## The following object is masked from 'package:lattice':
##
## densityplot
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:Hmisc':
##
## %nin%
## The following objects are masked from 'package:janitor':
##
## remove_empty_cols, remove_empty_rows
## The following object is masked from 'package:purrr':
##
## is_empty
## The following object is masked from 'package:tidyr':
##
## replace_na
## The following object is masked from 'package:tibble':
##
## add_case
##
## Attaching package: 'ggiraphExtra'
## The following objects are masked from 'package:moonBook':
##
## addLabelDf, getMapping
##
## Attaching package: 'scales'
## The following object is masked from 'package:moonBook':
##
## comma
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Registering fonts with R
## Parsed with column specification:
## cols(
## .default = col_double(),
## R_fighter = col_character(),
## B_fighter = col_character(),
## Referee = col_character(),
## date = col_date(format = ""),
## location = col_character(),
## Winner = col_character(),
## title_bout = col_logical(),
## weight_class = col_character(),
## B_Stance = col_character(),
## R_Stance = col_character()
## )
## See spec(...) for full column specifications.
## # A tibble: 5,144 x 145
## r_fighter b_fighter referee date location winner title_bout
## <chr> <chr> <chr> <date> <chr> <chr> <lgl>
## 1 Henry Ce… Marlon M… Marc G… 2019-06-08 Chicago… Red TRUE
## 2 Valentin… Jessica … Robert… 2019-06-08 Chicago… Red TRUE
## 3 Tony Fer… Donald C… Dan Mi… 2019-06-08 Chicago… Red FALSE
## 4 Jimmie R… Petr Yan Kevin … 2019-06-08 Chicago… Blue FALSE
## 5 Tai Tuiv… Blagoy I… Dan Mi… 2019-06-08 Chicago… Blue FALSE
## 6 Tatiana … Nina Ans… Robert… 2019-06-08 Chicago… Red FALSE
## 7 Aljamain… Pedro Mu… Marc G… 2019-06-08 Chicago… Red FALSE
## 8 Karolina… Alexa Gr… Kevin … 2019-06-08 Chicago… Blue FALSE
## 9 Ricardo … Calvin K… Dan Mi… 2019-06-08 Chicago… Blue FALSE
## 10 Yan Xiao… Angela H… Robert… 2019-06-08 Chicago… Red FALSE
## # … with 5,134 more rows, and 138 more variables: weight_class <chr>,
## # no_of_rounds <dbl>, b_current_lose_streak <dbl>,
## # b_current_win_streak <dbl>, b_draw <dbl>, b_avg_body_att <dbl>,
## # b_avg_body_landed <dbl>, b_avg_clinch_att <dbl>, b_avg_clinch_landed <dbl>,
## # b_avg_distance_att <dbl>, b_avg_distance_landed <dbl>,
## # b_avg_ground_att <dbl>, b_avg_ground_landed <dbl>, b_avg_head_att <dbl>,
## # b_avg_head_landed <dbl>, b_avg_kd <dbl>, b_avg_leg_att <dbl>,
## # b_avg_leg_landed <dbl>, b_avg_pass <dbl>, b_avg_rev <dbl>,
## # b_avg_sig_str_att <dbl>, b_avg_sig_str_landed <dbl>,
## # b_avg_sig_str_pct <dbl>, b_avg_sub_att <dbl>, b_avg_td_att <dbl>,
## # b_avg_td_landed <dbl>, b_avg_td_pct <dbl>, b_avg_total_str_att <dbl>,
## # b_avg_total_str_landed <dbl>, b_longest_win_streak <dbl>, b_losses <dbl>,
## # b_avg_opp_body_att <dbl>, b_avg_opp_body_landed <dbl>,
## # b_avg_opp_clinch_att <dbl>, b_avg_opp_clinch_landed <dbl>,
## # b_avg_opp_distance_att <dbl>, b_avg_opp_distance_landed <dbl>,
## # b_avg_opp_ground_att <dbl>, b_avg_opp_ground_landed <dbl>,
## # b_avg_opp_head_att <dbl>, b_avg_opp_head_landed <dbl>, b_avg_opp_kd <dbl>,
## # b_avg_opp_leg_att <dbl>, b_avg_opp_leg_landed <dbl>, b_avg_opp_pass <dbl>,
## # b_avg_opp_rev <dbl>, b_avg_opp_sig_str_att <dbl>,
## # b_avg_opp_sig_str_landed <dbl>, b_avg_opp_sig_str_pct <dbl>,
## # b_avg_opp_sub_att <dbl>, b_avg_opp_td_att <dbl>, b_avg_opp_td_landed <dbl>,
## # b_avg_opp_td_pct <dbl>, b_avg_opp_total_str_att <dbl>,
## # b_avg_opp_total_str_landed <dbl>, b_total_rounds_fought <dbl>,
## # b_total_time_fought_seconds <dbl>, b_total_title_bouts <dbl>,
## # b_win_by_decision_majority <dbl>, b_win_by_decision_split <dbl>,
## # b_win_by_decision_unanimous <dbl>, b_win_by_ko_tko <dbl>,
## # b_win_by_submission <dbl>, b_win_by_tko_doctor_stoppage <dbl>,
## # b_wins <dbl>, b_stance <chr>, b_height_cms <dbl>, b_reach_cms <dbl>,
## # b_weight_lbs <dbl>, r_current_lose_streak <dbl>,
## # r_current_win_streak <dbl>, r_draw <dbl>, r_avg_body_att <dbl>,
## # r_avg_body_landed <dbl>, r_avg_clinch_att <dbl>, r_avg_clinch_landed <dbl>,
## # r_avg_distance_att <dbl>, r_avg_distance_landed <dbl>,
## # r_avg_ground_att <dbl>, r_avg_ground_landed <dbl>, r_avg_head_att <dbl>,
## # r_avg_head_landed <dbl>, r_avg_kd <dbl>, r_avg_leg_att <dbl>,
## # r_avg_leg_landed <dbl>, r_avg_pass <dbl>, r_avg_rev <dbl>,
## # r_avg_sig_str_att <dbl>, r_avg_sig_str_landed <dbl>,
## # r_avg_sig_str_pct <dbl>, r_avg_sub_att <dbl>, r_avg_td_att <dbl>,
## # r_avg_td_landed <dbl>, r_avg_td_pct <dbl>, r_avg_total_str_att <dbl>,
## # r_avg_total_str_landed <dbl>, r_longest_win_streak <dbl>, r_losses <dbl>,
## # r_avg_opp_body_att <dbl>, r_avg_opp_body_landed <dbl>, …
data <- data %>%
mutate(women = ifelse(grepl("Women", weight_class), TRUE, FALSE))
fight_data <- data %>%
dplyr::select(1:9)
#Splitting into men and Women
data_men <- data %>%
subset(women == FALSE)
data_women <- data %>%
subset(women == TRUE)
#Splitting data into Fight and Fighter Data
blue_profile_data <- data_men %>%
dplyr::select(Referee, date, location, starts_with("B_")) %>%
mutate(color = "blue")
red_profile_data <- data_men %>%
dplyr::select(Referee, date, location, starts_with("R_")) %>%
mutate(color = "red")
#Cleaning column names
names(blue_profile_data) <- ifelse(grepl("B_", names(blue_profile_data)), substring(names(blue_profile_data), 3), names(blue_profile_data))
names(red_profile_data) <- ifelse(grepl("R_", names(red_profile_data)), substring(names(red_profile_data), 3), names(red_profile_data))
#Unifying the Fighter Datasets
fighter_profile_data <- rbind(blue_profile_data, red_profile_data) %>%
mutate(Stance = as.factor(Stance)) %>%
rename(win_by_ko_tko = `win_by_KO/TKO`,
total_time_fought_seconds = `total_time_fought(seconds)`)
# Turning data relative to number of fights
fighter_profile_data <- fighter_profile_data %>%
mutate(win_by_Decision_Majority = win_by_Decision_Majority/wins,
win_by_Decision_Split = win_by_Decision_Split/wins,
win_by_Decision_Unanimous = win_by_Decision_Unanimous/wins,
win_by_Submission = win_by_Submission/wins,
win_by_ko_tko = win_by_ko_tko/wins,
win_by_TKO_Doctor_Stoppage = win_by_TKO_Doctor_Stoppage/wins
) %>%
mutate(win_by_ko_tko = win_by_ko_tko + win_by_TKO_Doctor_Stoppage,
win_by_Decision = win_by_Decision_Majority + win_by_Decision_Split + win_by_Decision_Unanimous) %>%
dplyr::select(-win_by_Decision_Majority, -win_by_Decision_Split, -win_by_Decision_Unanimous, -win_by_TKO_Doctor_Stoppage)#Scaling the Data:
fighter_profile_data_reduced <- fighter_profile_data %>%
dplyr::select(Referee, #Do not Remove!
date, #Do not Remove!
location, #Do not Remove!
fighter, #Do not Remove!
Reach_cms, #Please change parameters to be included in clustering here
Weight_lbs,
ends_with("_att"),
avg_REV,
avg_PASS,
avg_KD,
win_by_Decision,
win_by_ko_tko,
win_by_Submission
) %>%
#what do avg_REV, avg_PASS, avg_KD measure??
dplyr::select(-starts_with("avg_opp")) %>%
dplyr::select(-avg_TOTAL_STR_att, -avg_DISTANCE_att, -avg_SIG_STR_att) %>%
na.omit() #filtering all fighters with missing data
fighter_profile_data_scaled <- fighter_profile_data_reduced %>%
dplyr::select(-Referee,
-date,
-location,
-fighter)
fighter_profile_data_scaled <- data.frame(scale(fighter_profile_data_scaled))fighter_profile_data_scaled %>%
ggcorr(method = c("pairwise", "pearson"), layout.exp = 3,label_round=2, label = TRUE,label_size = 2,hjust = 1)fviz_nbclust(fighter_profile_data_scaled, kmeans, method = "silhouette", k.max = 15) +
labs(subtitle = "Silhouette method")## Warning: did not converge in 10 iterations
model_km2 <- kmeans(fighter_profile_data_scaled, centers = 2, nstart = 50, iter.max = 100)
model_km3 <- kmeans(fighter_profile_data_scaled, centers = 3, nstart = 50, iter.max = 100)
model_km4 <- kmeans(fighter_profile_data_scaled, centers = 4, nstart = 50, iter.max = 100)
model_km5 <- kmeans(fighter_profile_data_scaled, centers = 5, nstart = 50, iter.max = 100)#add clusters to the data frame
fighter_profile_data_scaled_withClusters <- mutate(fighter_profile_data_scaled, cluster = as.factor(model_km2$cluster))
#First let's find the averages of the variables by cluster
center_locations <- fighter_profile_data_scaled_withClusters %>%
group_by(cluster) %>%
summarize_at(vars(Reach_cms:win_by_Submission),mean)
#Next I use gather to collect information together
xa2 <- gather(center_locations, key = "variable", value = "value", -cluster, factor_key = TRUE)
#Next I use ggplot to visualize centers
knnCenters <- ggplot(xa2, aes(x = variable, y = value)) +
geom_line(aes(color = cluster,group = cluster), linetype = "dashed",size=1) +
geom_point(size=2,shape=4) +
geom_hline(yintercept=0) +
ggtitle("K-means centers k=2") +
labs(fill = "Cluster") +
theme(text = element_text(size=10),
axis.text.x = element_text(angle=45,hjust=1),
legend.title=element_text(size=5),
legend.text = element_text(size=5)
)
knnCenters#add clusters to the data frame
fighter_profile_data_scaled_withClusters <- mutate(fighter_profile_data_scaled, cluster = as.factor(model_km4$cluster))
#Your code here
#First let's find the averages of the variables by cluster
center_locations <- fighter_profile_data_scaled_withClusters %>%
group_by(cluster) %>%
summarize_at(vars(Reach_cms:win_by_Submission),mean)
#Next I use gather to collect information together
xa4 <- gather(center_locations, key = "variable", value = "value", -cluster, factor_key = TRUE)
#Next I use ggplot to visualize centers
knnCenters <- ggplot(xa4, aes(x = variable, y = value)) +
geom_line(aes(color = cluster,group = cluster), linetype = "dashed",size=1) +
geom_point(size=2,shape=4) +
geom_hline(yintercept=0) +
ggtitle("K-means centers k=4") +
labs(fill = "Cluster") +
theme(text = element_text(size=10),
axis.text.x = element_text(angle=45,hjust=1),
legend.title=element_text(size=5),
legend.text = element_text(size=5)
)
knnCenters## [1] 3016 3626
## [1] 2300 2339 2003
## [1] 2052 1426 1460 1704
## [1] 1387 1215 1513 1356 1171
g2 <- fviz_cluster(model_km2, fighter_profile_data_scaled, palette = "Set2", ggtheme = theme_minimal())
g3 <- fviz_cluster(model_km3, fighter_profile_data_scaled, palette = "Set2", ggtheme = theme_minimal())
g4 <- fviz_cluster(model_km4, fighter_profile_data_scaled, palette = "Set2", ggtheme = theme_minimal())
g5 <- fviz_cluster(model_km5, fighter_profile_data_scaled, palette = "Set2", ggtheme = theme_minimal())
grid.arrange(g2,g3,g4,g5)##Combining the Data
The following code integrates the cluster in the original fighter-dataframe
fighter_profile_data_reduced_withClusters <- mutate(fighter_profile_data_reduced, cluster = as.factor(model_km4$cluster))
temp <- fighter_profile_data_reduced_withClusters %>%
dplyr::select(Referee, date, location, fighter, cluster)
fighter_profile_data <- fighter_profile_data %>%
left_join(temp, by = c("fighter", "location", "date", "Referee")) %>%
mutate(new_fighter = ifelse(is.na(total_time_fought_seconds), 1,0),
vet_but_no_wins = ifelse(!is.na(total_time_fought_seconds) & wins==0, 1, 0))This is a sanity check. The function should always output 0:
## Warning in is.na(cluster): is.na() applied to non-(list or vector) of type
## 'closure'
## [1] 0
## Parsed with column specification:
## cols(
## .default = col_character(),
## R_KD = col_double(),
## B_KD = col_double(),
## R_SUB_ATT = col_double(),
## B_SUB_ATT = col_double(),
## R_PASS = col_double(),
## B_PASS = col_double(),
## R_REV = col_double(),
## B_REV = col_double(),
## last_round = col_double(),
## last_round_time = col_time(format = "")
## )
## See spec(...) for full column specifications.
## # A tibble: 5,144 x 41
## r_fighter b_fighter r_kd b_kd r_sig_str b_sig_str r_sig_str_pct
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 Henry Ce… Marlon M… 0 0 90 of 171 57 of 119 52%
## 2 Valentin… Jessica … 1 0 8 of 11 2 of 12 72%
## 3 Tony Fer… Donald C… 0 0 104 of 2… 68 of 185 52%
## 4 Jimmie R… Petr Yan 0 2 73 of 192 56 of 189 38%
## 5 Tai Tuiv… Blagoy I… 0 1 64 of 144 73 of 123 44%
## 6 Tatiana … Nina Ans… 0 0 75 of 142 48 of 99 52%
## 7 Aljamain… Pedro Mu… 0 0 174 of 3… 105 of 2… 49%
## 8 Karolina… Alexa Gr… 0 0 90 of 232 148 of 3… 38%
## 9 Ricardo … Calvin K… 0 1 12 of 29 22 of 41 41%
## 10 Yan Xiao… Angela H… 0 0 94 of 249 71 of 144 37%
## # … with 5,134 more rows, and 34 more variables: b_sig_str_pct <chr>,
## # r_total_str <chr>, b_total_str <chr>, r_td <chr>, b_td <chr>,
## # r_td_pct <chr>, b_td_pct <chr>, r_sub_att <dbl>, b_sub_att <dbl>,
## # r_pass <dbl>, b_pass <dbl>, r_rev <dbl>, b_rev <dbl>, r_head <chr>,
## # b_head <chr>, r_body <chr>, b_body <chr>, r_leg <chr>, b_leg <chr>,
## # r_distance <chr>, b_distance <chr>, r_clinch <chr>, b_clinch <chr>,
## # r_ground <chr>, b_ground <chr>, win_by <chr>, last_round <dbl>,
## # last_round_time <time>, format <chr>, referee <chr>, date <chr>,
## # location <chr>, fight_type <chr>, winner <chr>
total_fight_data <- total_fight_data %>%
mutate(date = mdy(date)) %>%
dplyr::select(-Winner, -Fight_type) %>%
left_join(fight_data, by = c("R_fighter", "B_fighter", "Referee", "date", "location")) %>%
mutate(women = ifelse(grepl("Women", weight_class), TRUE, FALSE))
total_fight_data_men <- total_fight_data %>%
subset(women == FALSE)
total_fight_data_women <- total_fight_data %>%
subset(women == TRUE)total_fight_data_men$R_cluster <- NA
total_fight_data_men$B_cluster <- NA
total_fight_data_men$R_cluster <- fighter_profile_data$cluster[match(paste(total_fight_data_men$location, total_fight_data_men$Referee, total_fight_data_men$date, total_fight_data_men$R_fighter), paste(fighter_profile_data$location, fighter_profile_data$Referee, fighter_profile_data$date, fighter_profile_data$fighter))]
total_fight_data_men$B_cluster <- fighter_profile_data$cluster[match(paste(total_fight_data_men$location, total_fight_data_men$Referee, total_fight_data_men$date, total_fight_data_men$B_fighter), paste(fighter_profile_data$location, fighter_profile_data$Referee, fighter_profile_data$date, fighter_profile_data$fighter))]
sum(is.na(total_fight_data_men$R_cluster)) #1141## [1] 1141
## [1] 1877
## [1] 3014
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
## The following object is masked from 'package:purrr':
##
## map
## Loading required package: sp
## Checking rgeos availability: TRUE
##
## Attaching package: 'maptools'
## The following object is masked from 'package:Hmisc':
##
## label
## rgeos version: 0.5-2, (SVN revision 621)
## GEOS runtime version: 3.7.2-CAPI-1.11.2
## Linking to sp version: 1.3-1
## Polygon checking: TRUE
##
## Attaching package: 'rgeos'
## The following object is masked from 'package:Hmisc':
##
## translate
## rgdal: version: 1.4-6, (SVN revision 841)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
## Linking to sp version: 1.3-1
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
##
## Attaching package: 'spdep'
## The following object is masked from 'package:fmsb':
##
## geary.test
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(opencage)
library(sf)
library(gganimate)
library(hrbrthemes)
map_data <- total_fight_data_men %>%
group_by(location) %>%
summarise(count = n(),
first_fight = min(date))
latitudes <- numeric(0)
longitudes <- numeric(0)
locations <- map_data$location
for (i in locations){
temp = opencage_forward(i, key = OPENCAGE_KEY)
lat = median(temp$results$geometry.lat)
lng = median(temp$results$geometry.lng)
latitudes[i] = lat
longitudes[i] = lng
}
map_data <- map_data %>%
mutate(lat = latitudes,
lng = longitudes)
world <- ne_countries(scale = "medium", returnclass = "sf") %>%
filter(name != "Antarctica")
plt <- ggplot(world) +
geom_sf() +
geom_point(data=map_data,
aes(x=lng,
y=lat,
size=count,
color=first_fight,
group = seq_along(first_fight)))+
labs(title = "The History of UFC: How fights have expanded globally over time")+
theme(text=element_text(size = 13, family="Tahoma"))+
transition_reveal(first_fight) +
shadow_mark(alpha = 0.3, size = 0.5)
plt